c#1. Install packages
#install.packages('readstata13')
#install.packages('lars')
#install.packages("glmnet")
#install.packages("MASS")
#install.packages("Hmisc")


#2. Import data from STATA file (.dta) and data cleaning
rm(list=ls(all=TRUE))
library(readstata13)
data<-read.dta13("Q1-Q4 Rdata CU v2 weight.dta") ##the data is saved as 'Q1-Q4 Rdata CU v2.dta' in the share folder
#data.df<-data.frame(data)
datanew<-na.omit(data)
#datanew.df<-data.frame(datanew)
##2.1 assign "loge" as y
y<-datanew$loge
##2.2 assign all the other variables as x
x<-datanew[ , names(datanew)!="loge" & names(datanew)!="weight" & names(datanew)!="Electricity" & names(datanew)!="TotalExp"]
x<-data.matrix(x)

#3. Lasso model
##3.1 Lars packages
#fit.lasso<-lars(x, y, type = c("lasso"),  trace = FALSE, normalize = TRUE, intercept = TRUE, eps = .Machine$double.eps, use.Gram = TRUE)
#plot(fit.lasso,breaks=F,xvar='norm')
#cv.lasso <- cv.lars(x, y, type="lasso")
##3.2 glmnet packages for cross validation and variable selection
library(glmnet)
###3.2.1 coefficients - L1 Norm plot: each curve corresponds to a variable
#fit = glmnet(x, y)
#plot(fit)
###3.2.2 summary of the glmnet path at each step:
###      DF: The number of nonzero coefficients
###      %dev: the percent deviance explained
###      Lambda: the value of lambda
#print(fit)
###3.2.3 MSE - log(Lambda) plot: red dotted line is the cross-validation curve
set.seed(1)
cvfit = cv.glmnet(x, y,family="gaussian")
plot(cvfit)
###3.2.4 selected lambda in the above plot:
###      lambda.min is the value of lambda that gives minimum mean cross-validated error.
###      lambda.1se gives the most regularized model such that error is within one standard error of the minimum.
cvfit$lambda.min
cvfit$lambda.1se
###3.2.5 coefficients using lambda.min
coef<-coef(cvfit, s = "lambda.1se")
incvars<-as.matrix(coef)
incvars<-incvars[incvars!=0,]
incvars<-names(incvars)
x<-datanew[ , incvars[-1]]
ols<-lm(y~.,data=x)
ploge<-predict(ols)
rols=lm(y~x$logy)
results<-data.frame(logy=x$logy,loge=NA,ploge=NA,rmloge=NA,decile=NA)
results$ploge<-predict(rols)
results$loge<-y
results$rmloge<-results$ploge+ols$residuals

library("Hmisc")
TotalExp=exp(datanew$logy)
pct=wtd.quantile(TotalExp,weight=datanew$weight,probs=seq(from=0,to=1,by=0.1))
decile=cut(TotalExp,pct,labels=FALSE,right=FALSE,include.lowest=TRUE)
results$decile=as.factor(decile)
eshare=exp(datanew$loge-datanew$logy)
esharecont=exp(results$rmloge-datanew$logy)
longes=c(eshare,esharecont)
longdec=c(decile,decile)
longdata=c(rep(1,length(decile)),rep(2,length(decile)))
long<-data.frame(longes,longdec=as.factor(longdec),longdata=as.factor(longdata))
boxplot(longes~longdata*longdec,outline=FALSE,horizontal=TRUE)
require(ggplot2)
ylim1 = boxplot.stats(longes[longdec==1])$stats[c(1, 5)]
postscript(file="figure4.eps",width=1200,height=800,res=200)
ggplot(data=long,aes(x=longdec, y=longes,fill=longdata))+
  ylab("electricity share of total expenditures")+
  xlab("expenditure decile") +
  geom_boxplot()+
  theme(axis.title.y=element_text(vjust=-10)) +
  theme(plot.margin = unit(c(0.5,0.5,0.5,-.25), "cm"))+
#  geom_boxplot(outlier.shape=NA)+
#  coord_cartesian(ylim = ylim1*1.05)+
  coord_flip()+
  theme(legend.justification = c(1, 1),
        legend.position = c(.95,.95))+
  theme(legend.title = element_blank())+
  scale_x_discrete(labels=c("1"="(poorest) 1","10"="(richest) 10"))+
#  scale_fill_discrete(name="",labels,
#        c("raw data","with demo-\ngraphic controls"))
scale_fill_grey(start=0.5,end=0.8,name="",labels,
                c("raw data","with demo-\ngraphic controls"))
dev.off()
